### Configuration
################################################################################
### R Options
### R Options
# Query SLURM for the resources configured on this node. The CfgTRES line looks
# like "CfgTRES=cpu=32,mem=128000M,..."; parse out the CPU count and memory.
nodeInfo = system(paste0("scontrol show node ", Sys.info()["nodename"]), intern = TRUE)
cpus = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "cpu="))[2], ","))[1])
# NOTE(review): despite the name, this value is in bytes (MB * 1024^2)
memInKB = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "mem="))[2], "M,"))[1]) * 1024^2
options(stringsAsFactors=FALSE,
dplyr.summarise.inform=FALSE,
# Cap future's global export size at 2 GB or the node memory, whichever is less
future.globals.maxSize=min(memInKB, 2 * 1024^3),
# Fixed: min(cpus, 1) always evaluated to 1, defeating the CPU parsing above.
# Use all configured CPUs, falling back to 1 if parsing the SLURM output failed.
mc.cores=max(cpus, 1L, na.rm=TRUE),
future.fork.enable=TRUE, future.plan="multicore",
future.rng.onMisuse="ignore")
### Functions
# Git directory and files to source must be done first, then all helper functions can be sourced
# Helper function files that must be sourced before anything else runs
git_files_to_source = c("functions_analysis.R",
"functions_biomart.R",
"functions_degs.R",
"functions_io.R",
"functions_plotting.R",
"functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
# file.exists() is vectorised; no per-element map needed
file_exists = file.exists(git_files_to_source)
if (any(!file_exists)) {
  # Fixed message construction: the previous paste() inserted stray spaces
  # around the path and ended with "'.!"
  stop("The following files could not be found: ",
       paste(git_files_to_source[!file_exists], collapse=", "),
       ". Please check the git directory at '", param$path_to_git, "'.")
}
# Source all helpers; invisible() suppresses the returned list
invisible(purrr::map(git_files_to_source, source))
### Debugging mode
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions
param$debugging_mode = "default_debugging"
# Install the matching error handler (helpers sourced from the git repo above).
# The final unnamed argument is switch()'s fallback, used when debugging_mode
# matches none of the named options.
switch (param$debugging_mode,
default_debugging=on_error_default_debugging(),
terminal_debugger=on_error_start_terminal_debugger(),
print_traceback=on_error_just_print_traceback(),
on_error_default_debugging())### Rmarkdown configuration
################################################################################
### Create output directories
# "figures" and "data" sub-directories under the output path (no-op if present);
# the anonymous function avoids leaking loop variables into the global env
invisible(lapply(c("figures", "data"), function(d) {
  d_path = file.path(param$path_out, d)
  if (!file.exists(d_path)) dir.create(d_path, recursive=TRUE, showWarnings=FALSE)
}))
### R Options
options(citation_format="pandoc",
        knitr.table.format="html",
        kableExtra_view_html=TRUE)
### Knitr default options
knitr::opts_chunk$set(echo=TRUE,                    # output code
                      cache=FALSE,                  # do not cache results
                      message=TRUE,                 # show messages
                      warning=TRUE,                 # show warnings
                      tidy=FALSE,                   # do not auto-tidy-up code
                      fig.width=10,                 # default figure width in inches
                      class.source='fold-hide',     # collapse code blocks by default
                      dev=c('png', 'svg', 'tiff'),  # figure devices; the first (png) is used for HTML output
                      dev.args=list(png = list(type="cairo"),
                                    tiff = list(type="cairo", compression = 'zip')),  # cairo works on the cluster; tiff uses "zip" (deflate, Adobe-style) compression
                      dpi=300,                      # figure resolution
                      fig.retina=2,                 # retina multiplier
                      fig.path=paste0(file.path(param$path_out, "figures"), "/")  # figure output path (trailing "/" is needed)
)
### Set output hooks
# Custom message/warning formatting (helpers sourced from the git repo)
knitr::knit_hooks$set(message=format_message, warning=format_warning)
### Required libraries
# magrittr provides the %>% pipe used throughout this document
library(magrittr)
### Set the bibliographic environment
# Clear previous bibliography
knitcitations::cleanbib()
# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
rcug_ref = knitcitations::read.bibtex(file.path(param$path_to_git,"assets","rcug_references.bib"))
# Cite the in-house reference so the bibliography is never empty
invisible(knitcitations::citep(rcug_ref))
### Figure heights
# Default figure heights (in inches) for the different panel layouts used in
# this report; the patchwork heights are derived from the single-row heights.
# Single figure, landscape
fig_standard_height = 4
# Two plots alongside (e.g. umaps)
fig_standard2_height = 5
# Three plots alongside (e.g. umaps)
fig_standard3_height = 4
# Four plots alongside (e.g. umaps)
fig_standard4_height = 2.5
# Four plots 2x2 (e.g. umaps)
fig_patchwork4_height = fig_standard2_height * 2
# Six plots 2x3 (e.g. umaps)
fig_patchwork6_height = fig_standard3_height * 2
# Eight plots 4x2 (e.g. umaps)
fig_patchwork8_height = fig_standard4_height * 2
# Twelve plots 4x3 (e.g. umaps)
fig_patchwork12_height = fig_standard4_height * 3
# Sixteen plots 4x4 (e.g. umaps)
fig_patchwork16_height = fig_standard4_height * 4# Load renv and virtualenvs
# Activate the project-local renv library for reproducible package versions
renv::load(file.path(param$path_to_git,"env/basic"))
# Register the Python virtualenv that reticulate-based packages should use
renv::use_python(type = "virtualenv", name = file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))
#reticulate::use_virtualenv(file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(knitr)Gene annotation including Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names, are loaded from a pre-prepared reference file or Ensembl.
## Read gene annotation
# We read gene annotation from file.
# We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names.
### Set reference
################################################################################
# Derive the biomaRt dataset and mitochondrial gene pattern from the species.
# For species other than human/mouse, param$mart_dataset must already be set
# by the caller. (The previous nested if/else also contained no-op
# self-assignments such as `param$mart_dataset=param$mart_dataset`.)
if (param$species=="human") {
  param$mart_dataset="hsapiens_gene_ensembl"
  param$mt = "^MT-"
} else if (param$species=="mouse") {
  param$mart_dataset="mmusculus_gene_ensembl"
  param$mt = "^mt-"
}
# Default Ensembl annotation version when none was specified
if (is.null(param$annot_version)) param$annot_version=98
# Paths to the pre-prepared annotation reference and cell-cycle marker files
param$path_reference=file.path(param$path_to_git, "references", param$mart_dataset, param$annot_version)
param$reference=paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt")
param$file_annot = file.path(param$path_reference, param$reference)
param$file_cc_genes = file.path(param$path_reference, "cell_cycle_markers.xlsx")
### Download reference if not existing
################################################################################
# Fetch annotation and cell-cycle marker files when either is missing.
# `||` (scalar, short-circuit) instead of `|`: both operands are length-1.
if (!file.exists(param$file_annot) || !file.exists(param$file_cc_genes)) {
  source(file.path(param$path_to_git, "modules/download_references/download_references.R"))
}
### read_ensembl_annotation
################################################################################
# Read the pre-prepared Ensembl gene annotation table and build translation
# dictionaries between Ensembl IDs, Seurat-compatible rownames and Entrez IDs.
# Load from file
annot_ensembl = read.delim(param$file_annot)
# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}
# Translate IDs
# TranslateIDs() is a project helper (presumably from functions_biomart.R); the
# order of the four returned elements is assumed from the assignments below --
# NOTE(review): confirm against the helper's definition.
IDs_out = suppressWarnings(TranslateIDs(annot_ensembl, param$annot_main))
ensembl_to_seurat_rowname = IDs_out[[1]]
seurat_rowname_to_ensembl = IDs_out[[2]]
seurat_rowname_to_entrez = IDs_out[[3]]
annot_ensembl = IDs_out[[4]]
### read_cc_genes
################################################################################
# Use biomart to translate human cell cycle genes to the species of interest
# Load from file
# Sheet 1: S-phase markers, sheet 2: G2M-phase markers -- NOTE(review): sheet
# order assumed from the variable names; confirm against the xlsx file.
genes_s = openxlsx::read.xlsx(param$file_cc_genes, sheet=1)
genes_g2m = openxlsx::read.xlsx(param$file_cc_genes, sheet=2)
# Convert Ensembl ID to Seurat-compatible unique rowname
# (unname() drops the lookup vector's names so the column is a plain character vector)
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))The workflow is appicable to single cell and nuclei RNAseq data pre-processed via 10x Genomics or SmartSeq-2 or for other data that are represented by a simple table with transcript counts per gene and cell. In the first step, a Seurat object of the data is generated and subsequently some metadata are added. Similarly, a Seurat object can be loaded to inspect the stored scRNA seq data.
Here, for the project Testdata, the following data are analysed:
### Read rds object
################################################################################
### Load Seurat S4 objects
# Load a previously saved Seurat object from param$data, restore the parameters
# stored inside it (keeping run-specific settings), and make sure the
# sample/cluster/annotation colour palettes match the object's metadata.
# Test if file is defined
if (is.null(param$data)) {
message("Dataset is not specified")
} else {
# Test if file exists
if (file.exists(param$data)) {
# Read object
message(paste0("Load dataset:", param$data))
sc = base::readRDS(param$data)
# Transfer original params to loaded object
# NOTE(review): list_names() is a project helper; assumed to return the
# element names of sc@misc -- confirm in functions_util.R
if ("parameters" %in% list_names(sc@misc[])) {
orig_param = sc@misc$parameters
# Keep the settings of the current run; everything else is taken over from
# the parameters stored in the object, then advanced settings (param_advset)
# override both
param_keep = param[c("path_to_git", "scriptname", "author", "project_id", "data", "path_out", "file_annot", "file_cc_genes")]
param = modifyList(x = param, val = orig_param)
param = modifyList(x = param, val = param_keep)
param = modifyList(x = param, val = param_advset)
}
### Set colors
# Set sample colors based on orig.ident
# (re-generate the palette whenever the number of provided colours does not
# match the number of distinct values)
if (!is.null(sc@meta.data[["orig.ident"]])) {
if (length(unique(sc@meta.data[["orig.ident"]]))!=length(param$col_samples)) {
message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
sample_colours_out = suppressWarnings(SetSampleColours(sc, param$col_palette_samples))
sc = sample_colours_out[[1]]
param$col_samples = sample_colours_out[[2]]
}
}
# Set colors for clusters based on seurat_clusters
if (!is.null(sc@meta.data[["seurat_clusters"]])) {
if (length(unique(sc@meta.data[["seurat_clusters"]]))!=length(param$col_clusters)) {
message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
cluster_colours_out = suppressWarnings(SetClusterColours(sc, param$col_palette_clusters))
sc = cluster_colours_out[[1]]
param$col_clusters = cluster_colours_out[[2]]
}
}
# Set colors for cell types based on annotation
if (!is.null(sc@meta.data[["annotation"]])) {
if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation))
sc = annotation_colours_out[[1]]
param$col_annotation = annotation_colours_out[[2]]
}
}
# Evaluate the object so it is auto-printed in the rendered report
sc
} else {
message("Dataset does not exist")
}
}
### Load reference Seurat S4 objects if specified
# Same procedure as for the main object above, but the result goes into scR and
# the colour palettes are stored under *_ref parameter names.
# Test if file is defined
if (!is.null(param$refdata)) {
# Test if file exists
if (file.exists(param$refdata)) {
# Read object
message(paste0("Load dataset:", param$refdata))
scR = base::readRDS(param$refdata)
# Transfer original params to loaded object
if ("parameters" %in% list_names(scR@misc[])) {
orig_paramR = scR@misc$parameters
# Only the colour palettes are taken over from the reference object
if (!is.null(orig_paramR$col_samples)) {
param$col_samples_ref = orig_paramR$col_samples
}
if (!is.null(orig_paramR$col_clusters)) {
param$col_clusters_ref = orig_paramR$col_clusters
}
if (!is.null(orig_paramR$col_annotation)) {
param$col_annotation_ref = orig_paramR$col_annotation
}
# Advanced settings override everything
param = modifyList(x = param, val = param_advset)
}
### Set colors
# Set sample colors based on orig.ident
if (!is.null(scR@meta.data[["orig.ident"]])) {
if (length(unique(scR@meta.data[["orig.ident"]]))!=length(param$col_samples_ref)) {
message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
sample_colours_out = suppressWarnings(SetSampleColours(scR, param$col_palette_samples))
scR = sample_colours_out[[1]]
param$col_samples_ref = sample_colours_out[[2]]
}
}
# Set colors for clusters based on seurat_clusters
if (!is.null(scR@meta.data[["seurat_clusters"]])) {
if (length(unique(scR@meta.data[["seurat_clusters"]]))!=length(param$col_clusters_ref)) {
message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
cluster_colours_out = suppressWarnings(SetClusterColours(scR, param$col_palette_clusters))
scR = cluster_colours_out[[1]]
param$col_clusters_ref = cluster_colours_out[[2]]
}
}
# Set colors for cell types based on annotation
if (!is.null(scR@meta.data[["annotation"]])) {
if (length(unique(scR@meta.data[["annotation"]]))!=length(param$col_annotation_ref)) {
message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
annotation_colours_out = suppressWarnings(SetAnnotationColours(scR, param$col_palette_annotation))
scR = annotation_colours_out[[1]]
param$col_annotation_ref = annotation_colours_out[[2]]
}
}
# Evaluate so the object is printed in the report
scR
} else {
message("Reference dataset does not exist")
}
} # Transfer object into a list
# Wrap the single Seurat object into a named list so that downstream chunks
# can iterate over one or several datasets uniformly
sc_original = sc
sc = list()
n = param$project_id
sc[[n]] = sc_original# Download test dataset
# Download test dataset: source the matching download script from the git repo
param$path_test_dataset=file.path(param$path_to_git, "modules/download_test_datasets", paste0(param$download_test_datasets, ".R"))
if (file.exists(param$path_test_dataset)) {
  message(paste0("Using test dataset '", gsub('download_','', param$download_test_datasets), "'."))
  # Data output in a data subfolder of the git directory.
  # Fixed: the directory is now created under param$path_to_git (where we
  # setwd() to below); previously "data" was created relative to the current
  # working directory, which is not necessarily the git directory.
  if (!file.exists(file.path(param$path_to_git, "data"))) dir.create(file.path(param$path_to_git, "data"), recursive=TRUE, showWarnings=FALSE)
  setwd(file.path(param$path_to_git, "data"))
  source(param$path_test_dataset)
  # One row per dataset directory found under data/counts; all are 10x-formatted
  param$path_data = data.frame(name=list.dirs(path = file.path(param$path_to_git, "data", "counts"),
                                              full.names = FALSE, recursive = FALSE),
                               type=c("10x"),
                               path=list.dirs(path = file.path(param$path_to_git, "data", "counts"),
                                              full.names = TRUE, recursive = FALSE))
} else {
  message("Test dataset does not exist.")
}
# List of Seurat objects
sc = list()
datasets = param$path_data
# Read each dataset into one (or more) Seurat objects.
# Fixed: seq_len() instead of seq() -- seq(0) yields c(1, 0) when there are no
# datasets, whereas seq_len(0) correctly yields an empty sequence.
for (i in seq_len(nrow(datasets))) {
  name = datasets[i, "name"]
  type = datasets[i, "type"]
  path = datasets[i, "path"]
  suffix = datasets[i, "suffix"]
  # Read 10X or smartseq2
  if (type == "10x") {
    # Read 10X sparse matrix into a Seurat object
    sc = c(sc, ReadSparseMatrix(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, cellnames_suffix=suffix))
  } else if (type == "smartseq2") {
    # Read counts table into a Seurat object (can return one object per sample)
    sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE, cellnames_suffix=suffix))
  }
}
# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
# NOTE(review): names(sc) itself is not updated here, only the orig.ident
# metadata; if duplicate list names persist, downstream name-based indexing
# may misbehave -- confirm.
sample_names = names(sc)
duplicated_sample_names_idx = which(sample_names %in% sample_names[duplicated(sample_names)])
for (i in duplicated_sample_names_idx) {
  sample_names[i] = paste(head(sc[[i]][["orig.dataset", drop=TRUE]], 1), sample_names[i], sep=".")
  sc[[i]][["orig.ident"]] = sample_names[i]
}
# Set up colors for samples and add them to the sc objects
sample_names = purrr::flatten_chr(purrr::map(sc, function(s) {
  nms = unique(as.character(s[[]][["orig.ident"]]))
  return(nms)
}))
param$col_samples = GenerateColours(num_colours=length(sample_names), names=sample_names, palette=param$col_palette_samples, alphas=1)
sc = purrr::map(sc, ScAddLists, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")
message("Read scRNA-seq data into Seurat object")
sc
## $sample1
## An object of class Seurat
## 8000 features across 500 samples within 1 assay
## Active assay: RNA (8000 features, 0 variable features)
## 1 layer present: counts
##
## $sample2
## An object of class Seurat
## 8000 features across 700 samples within 1 assay
## Active assay: RNA (8000 features, 0 variable features)
## 1 layer present: counts
### Calculate percentages of mitochondrial and ribosomal genes as well as ERCC (qc_calculate_cells)
# Percentage of counts in mitochondrial genes, stored as "percent_mt"
# (objects that already carry the column are returned unchanged)
sc = purrr::map(sc, function(s) {
  if ("percent_mt" %in% colnames(s@meta.data)) return(s)
  mt_features = grep(pattern=param$mt, rownames(s), value=TRUE)
  Seurat::PercentageFeatureSet(s, features=mt_features, col.name="percent_mt", assay=param$assay_raw)
})
# Percentage of counts in ribosomal genes, stored as "percent_ribo"
sc = purrr::map(sc, function(s) {
  if ("percent_ribo" %in% colnames(s@meta.data)) return(s)
  ribo_features = grep(pattern="^RP[SL]", rownames(s), value=TRUE, ignore.case=TRUE)
  Seurat::PercentageFeatureSet(s, features=ribo_features, col.name="percent_ribo", assay=param$assay_raw)
})
# Percentage of counts in ERCC spike-ins, if such an assay is available
sc = purrr::map(sc, function(s) {
  if ("ERCC" %in% Seurat::Assays(s)) {
    s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
  }
  s
})
# Combine cell metadata of the Seurat objects into one big metadata
# NOTE(review): the suppressed warning is assumed to stem from row-binding
# objects with differing columns/factor levels -- confirm.
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s) return(s[[]])) %>% as.data.frame())
# Names of all available QC metrics
cell_qc_features = c(paste0(c("nFeature_", "nCount_"), param$assay_raw), "percent_mt")
# percent_ercc only exists when an ERCC assay was present upstream
if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features, "percent_ercc")
# values_to_names() is a project helper; presumably names the vector by its
# own values so it can be mapped over by name -- TODO confirm
cell_qc_features = values_to_names(cell_qc_features)
### Expand filter thresholds to all samples (qc_criteria_create)
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
# Note: the anonymous function still sees the ORIGINAL param$cell_filter while
# purrr::map() runs; the result replaces it only after the map completes, so a
# global specification is safely replicated for every sample name.
param$cell_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$cell_filter)) {
return(param$cell_filter[[s]])
} else {
return(param$cell_filter)
}
})
# Same expansion for the per-feature filters
param$feature_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$feature_filter)) {
return(param$feature_filter[[s]])
} else {
return(param$feature_filter)
}
})
# List of all filter thresholds per QC metrics
# For every QC metric: one long-format table with a min/max threshold row per
# sample that defines a threshold for that metric (others contribute no rows).
cell_qc_thresholds = purrr::map(cell_qc_features, function(metric) {
  purrr::map_dfr(names(param$cell_filter), function(smp) {
    filt = param$cell_filter[[smp]]
    # Samples without a threshold for this metric are skipped
    if (!(metric %in% names(filt))) return(NULL)
    thresholds = data.frame(orig.ident=smp, min=filt[[metric]][1], max=filt[[metric]][2]) %>%
      tidyr::pivot_longer(cols=2:3, names_to=c("threshold")) %>%
      dplyr::filter(!is.na(value))
    thresholds$threshold = factor(thresholds$threshold, levels=c("min", "max"))
    thresholds
  })
})
### Calculate diverse feature characteristics (qc_calculate_features)
# Only RNA assay at the moment
# counts_median uses sapply on the counts matrix, which converts the sparse matrix into a normal matrix
# This might have to be adapted in future (Sparse Matrix Apply Function)
# For every dataset: compute per-gene QC measures (number of cells expressing
# the gene, number passing the per-sample min_counts filter, and the median
# percentage of counts per cell) and attach them as assay-level metadata.
sc = purrr::map(list_names(sc), function(n) {
# Calculate percentage of counts per gene in a cell
counts_rna = Seurat::GetAssayData(sc[[n]], layer="counts", assay=param$assay_raw)
total_counts = sc[[n]][[paste0("nCount_", param$assay_raw), drop=TRUE]]
# Column-wise division via double transpose (divides each cell's counts by
# that cell's total counts)
counts_rna_perc = Matrix::t(Matrix::t(counts_rna)/total_counts)*100
# Calculate feature filters
num_cells_expr = Matrix::rowSums(counts_rna >= 1)
num_cells_expr_threshold = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
# Calculate median of counts_rna_perc per gene
# (apply() densifies the sparse matrix -- see the note above)
counts_median = apply(counts_rna_perc, 1, median)
# Add all QC measures as metadata
sc[[n]][[param$assay_raw]] = Seurat::AddMetaData(sc[[n]][[param$assay_raw]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
return(sc[[n]])
})The following first table shows available metadata (columns) of the
first 5 cells (rows). These metadata provide additional information
about the cells in the dataset, such as the sample a cell belongs to
(“orig.ident”), the number of mapped reads (“nCount_RNA”), the number
of unique genes detected (“nFeature_RNA”), or percentage of
mitochondrial genes (“percent_mt”).
The second table shows available metadata (columns) of the first 5 genes
(rows).
# Print cell metadata
# Render the first rows of the combined cell metadata as a scrollable HTML table.
# NOTE(review): head() defaults to 6 rows while the caption says "top 5 rows"
# -- confirm which is intended.
knitr::kable(head(sc_cell_metadata), align="l", caption="Cell metadata, top 5 rows") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")| orig.ident | nCount_RNA | nFeature_RNA | orig.dataset | percent_mt | percent_ribo | |
|---|---|---|---|---|---|---|
| AACGAAACAGCGTTTA-1 | sample1 | 1901 | 682 | sample1 | 14.571278 | 11.888480 |
| GCTTCACCAACACGAG-1 | sample1 | 9196 | 2070 | sample1 | 8.231840 | 19.214876 |
| TAGTGCACAGAGGAAA-1 | sample1 | 7520 | 1822 | sample1 | 7.140957 | 17.433511 |
| AGAGCAGTCGCGCTGA-1 | sample1 | 2611 | 902 | sample1 | 13.481425 | 32.669475 |
| TGTTCATGTCCTGGGT-1 | sample1 | 2658 | 937 | sample1 | 14.522197 | 27.990971 |
| CTTGAGATCCCGAGGT-1 | sample1 | 287 | 41 | sample1 | 86.411150 | 4.529617 |
# Print gene metadata
# First 5 rows of the feature-level metadata of the first dataset, rendered
# as a scrollable HTML table
knitr::kable(head(sc[[1]][[param$assay_raw]][[]], 5), align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")| feature_id | feature_name | feature_type | num_cells_expr | num_cells_expr_threshold | counts_median | |
|---|---|---|---|---|---|---|
| AL627309.1 | ENSG00000238009 | AL627309.1 | Gene Expression | 1 | 1 | 0 |
| LINC01409 | ENSG00000237491 | AL669831.5 | Gene Expression | 32 | 32 | 0 |
| FAM41C | ENSG00000230368 | FAM41C | Gene Expression | 17 | 17 | 0 |
| KLHL17 | ENSG00000187961 | KLHL17 | Gene Expression | 12 | 12 | 0 |
| PLEKHN1 | ENSG00000187583 | PLEKHN1 | Gene Expression | 1 | 1 | 0 |
Quality control (QC) is an important step of the pre-processing workflow. Here we assess the cell quality, i.e. the cell viability and duplicate rate to determine filter parameter.
These commonly used QC metrics, namely the number of unique genes detected in each cell (“nFeature_”), the total number of molecules detected in each cell (“nCount_”), and the percentage of counts that map to the mitochondrial genome (“percent_mt”), can be used to infer filter thresholds. If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).
In cell QC these covariates are filtered via thresholding as they are a good indicator of cell viability. However, it is crucial to consider the three QC covariates jointly as otherwise it might lead to misinterpretation of cellular signals.
r knitcitations::citet("https://doi.org/10.1093/gigascience/giaa151")
as ambient RNA can confound the number of observed counts and adds
background noise to the data.# Create plot per QC metric
# Violin plot of each QC metric per sample, with the configured filter
# thresholds overlaid as horizontal segments
p_list = list()
for (m in cell_qc_features) {
p_list[[m]]= ggplot(sc_cell_metadata[, c("orig.ident", m)], aes(x=.data[["orig.ident"]], y=.data[[m]], fill=.data[["orig.ident"]], group=.data[["orig.ident"]])) +
geom_violin(scale="width")
# Adds points for samples with less than three cells since geom_violin does not work here
p_list[[m]] = p_list[[m]] +
geom_point(data=sc_cell_metadata[, c("orig.ident", m)] %>% dplyr::filter(orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes(x=.data[["orig.ident"]], y=.data[[m]], fill=.data[["orig.ident"]]), shape=21, size=2)
# Now add style
p_list[[m]] = p_list[[m]] +
AddStyle(title=m, legend_position="none", fill=param$col_samples, xlab="") +
theme(axis.text.x=element_text(angle=45, hjust=1))
# Add filter thresholds as segments to the plot; "max" thresholds are drawn
# dashed and "min" thresholds with linetype "F1" (the scale below maps
# max->"dashed", min->"F1"; the previous comment had the two swapped)
if (nrow(cell_qc_thresholds[[m]]) > 0) {
p_list[[m]] = p_list[[m]] + geom_segment(data=cell_qc_thresholds[[m]],
aes(x=as.integer(as.factor(orig.ident))-0.5,
xend=as.integer(as.factor(orig.ident))+0.5,
y=value, yend=value, lty=threshold), colour="firebrick") +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min")))
}
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values")
p# Correlate QC metrics for cells
# Pairwise scatter plots of the QC covariates, coloured by sample, with the
# configured filter thresholds overlaid as vertical/horizontal lines
p_list = list()
# Plot cells in random order so no sample systematically overplots the others
sc_cell_metadata_plot_order = sample(1:nrow(sc_cell_metadata))
# nFeature vs nCount
m = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p_list[[1]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=.data[[m[1]]], y=.data[[m[2]]], colour=.data[["orig.ident"]])) +
geom_point(size = param$pt_size) +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
AddStyle(col=param$col_samples)
# Thresholds of the x metric as vertical, of the y metric as horizontal lines
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[1]] = p_list[[1]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[1]] = p_list[[1]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
# nFeature vs percent_mt
m = c("percent_mt", paste0(c("nFeature_"), param$assay_raw))
p_list[[2]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=.data[[m[1]]], y=.data[[m[2]]], colour=.data[["orig.ident"]])) +
geom_point(size = param$pt_size) +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[2]] = p_list[[2]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[2]] = p_list[[2]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
# nFeature vs percent_ercc (if available)
if ("percent_ercc" %in% names(cell_qc_features)) {
m = c("percent_ercc", paste0(c("nFeature_"), param$assay_raw))
p_list[[3]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=.data[[m[1]]], y=.data[[m[2]]], colour=.data[["orig.ident"]])) +
geom_point(size = param$pt_size) +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[3]] = p_list[[3]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[3]] = p_list[[3]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
}
# Combine plots
# Collect legends into one shared legend only when several panels are shown
p = patchwork::wrap_plots(p_list, ncol=length(p_list)) + patchwork::plot_annotation("QC covariates scatter plots")
if (length(p_list) == 1) {
p = p & theme(legend.position="bottom")
} else {
p = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
}
pTo save time and computational resources, we down-sample the cell number before continuing with further quality assessment. According to provided number of samples the cell number is down-sampled to 1000 cells (1 sample), 500 (2-3 samples), or 300 (4 and more samples).
# downsample
# according to samples number to 1000 cells (1 sample), 500 (2-3 samples), or 300 (4 and more samples)
if (length(sc) >= 4) {
  param$downsample_cells_n = 300
} else if (length(sc) > 1) {
  param$downsample_cells_n = 500
} else {
  param$downsample_cells_n = 1000
}
# downsample_cells_n overwrites downsample_cells_equally
# NOTE(review): downsample_cells_n is always set above, so the
# downsample_cells_equally branch is currently never reached -- confirm intent.
if (!is.null(param$downsample_cells_n)) {
  n = param$downsample_cells_n
} else if (param$downsample_cells_equally) {
  n = purrr::map_int(sc, ncol) %>% min()
}
# Actual downsampling
# `||` (scalar, short-circuit) instead of `|`: both conditions are length-1
if (!is.null(param$downsample_cells_n) || param$downsample_cells_equally) {
  sc = purrr::map(sc, function(s) {
    cells = ScSampleCells(sc=s, n=n, seed=1)
    return(subset(s, cells=cells))
  })
  # Adjust combined metadata accordingly
  sc_cell_metadata = sc_cell_metadata[unlist(purrr::map(sc, Cells)), ]
  message(paste0("Your data has been down-sampled to ", param$downsample_cells_n, " cells." ))
  print(sc)
}
(Message)
Your data has been down-sampled to 500
cells.
## $sample1
## An object of class Seurat
## 8000 features across 500 samples within 1 assay
## Active assay: RNA (8000 features, 0 variable features)
## 1 layer present: counts
##
## $sample2
## An object of class Seurat
## 8000 features across 500 samples within 1 assay
## Active assay: RNA (8000 features, 0 variable features)
## 1 layer present: counts
We next plot the genes with the highest median percentage of counts,
i.e. for each cell, we calculate the percentage of counts per gene and
then calculate the median value of these percentages for each gene in
all cells.
Often those genes are mitochondrial or ribosomal genes. Here rather the
percentage of raw counts per gene in a cell and similarity between
samples is of importance. A high percentage of raw counts of those,
presumably, less interesting genes, suggest to perform scaling of the
data. Differences in percentage of raw counts of those genes between
supposedly similar samples might be an indication for a batch effect.
# Plot only samples that we intend to keep
sc_names = names(sc)[!(names(sc) %in% param$samples_to_drop)]
# Union of the top-10 genes (by median % of counts per cell) over all samples
genes_highestExpr = lapply(sc_names, function(i) {
top_ten_exp = sc[[i]][[param$assay_raw]][["counts_median"]] %>% dplyr::arrange(dplyr::desc(counts_median)) %>% head(n=10)
return(rownames(top_ten_exp))
}) %>%
unlist() %>%
unique()
# One column of counts_median per sample for the selected genes
genes_highestExpr_counts = purrr::map_dfc(sc[sc_names], .f=function(s) s[[param$assay_raw]][["counts_median"]][genes_highestExpr, ])
genes_highestExpr_counts$gene = genes_highestExpr
# Long format (one row per gene x sample); sample order fixed for the x-axis
genes_highestExpr_counts = genes_highestExpr_counts %>% tidyr::pivot_longer(cols=all_of(sc_names))
genes_highestExpr_counts$name = factor(genes_highestExpr_counts$name, levels=sc_names)
col = GenerateColours(num_colours=length(genes_highestExpr), names=genes_highestExpr, palette="ggsci::pal_simpsons")
p = ggplot(genes_highestExpr_counts, aes(x=name, y=value, col=gene, group=gene)) +
geom_point() +
AddStyle(title="Top 10 highest expressed genes per sample, added into one list",
xlab="Sample", ylab="Median % of raw counts\n per gene in a cell",
legend_position="bottom",
col=col)
# Connect points with lines only when there is more than one sample
if (length(unique(genes_highestExpr_counts$name))>1) p = p + geom_line()
p
Dependent on the normalisation of your choice, we either:
The number of raw sequencing reads per cell is influenced by technical differences in the capture, reverse transcription and sequencing of RNA molecules, particularly due to the difficulty of achieving consistent library preparation with minimal starting material. Thus, comparing gene expression between cells may reveal differences that are solely due to sampling effects. After low-quality cells were removed by filtering, the primary goal of normalization is to remove technical sampling effects while preserving the true biological signal. Hence, normalization of the raw counts accounts for differences in sequencing depth per cell.
The standard log normalisation, a count depth scaling, is the simplest and most commonly used normalization strategy. The underlying idea is that each cell initially contained an equal number of mRNA molecules, and differences arise due to sampling effects. For each cell, the number of reads per gene is divided by a cell-specific “size factor”, which is proportional to the total count depth of the cell. The resulting normalized data add up to 1 per cell, and is then typically multiplied by a factor of 10 (10,000 in this workflow). Finally, normalized data are log-transformed for three important reasons. First, distances between log-transformed expression data represent log fold changes. Log-transformation emphasizes contributions from genes with strong relative differences, for example a gene that is expressed at an average count of 50 in cell type A and 10 in cell type B rather than a gene that is expressed at an average count of 1100 in A and 1000 in B. Second, log-transformation mitigates the relation between mean and variance in the data. Lastly, log-transformation reduces that skewness of the data as many downstream analyses require the data to be normally distributed.r Cite("10.1186/s13059-019-1874-1")).
SCTransform is a new statistical approach for the
modelling, normalization and variance stabilization of single-cell
RNA-seq data. SCTransform models the UMI counts (via a
regularized negative binomial model) to remove the variation due to
sequencing depth and adjusts the variance based on pooling information
across genes with similar abundances and automatically regresses out
sequencing depth (nCounts). Hence, SCTransform can be
applied to 10x (contains UMI counts) but not SmartSeq-2 data. Additional
unwanted sources of variations can be regressed out during
SCTransform.
While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.
# Normalize data the original way
# This is required to score cell cycle (https://github.com/satijalab/seurat/issues/1679)
# Run the normalization module only when the "data" layer is not yet present in
# the raw "RNA" assay (e.g. counts freshly loaded rather than restored from rds).
# NOTE(review): sc appears to be a named list of per-sample Seurat objects here,
# indexed by param$project_id — confirm against the loading module.
if (!("data" %in% sc[[param$project_id]][["RNA"]][])) {
source(file.path(param$path_to_git,'modules/pre-processing/normalization.R'))
}message(paste0("Here, the ", ifelse(param$norm=="RNA","Standard log normalization","SCTransform")," method was used and no additional sources of variance regressed out."))×
(Message)
Here, the Standard log normalization
method was used and no additional sources of variance regressed out.
r Cite("10.1186/s13059-019-1874-1")). This method first
models the technical variability as a relationship between mean gene
expression and variance using local polynomial regression. The model is
then used to calculate the expected variance based on the observed mean
gene expression. The difference between the observed and expected
variance is called residual variance and likely reflects biological
variability.
fig_height_vf = 5 * ceiling(length(names(sc))/2)
# If VariableFeatures not yet present in object
# Find variable features from normalized data (unaffected by scaling)
# Only needed when the data were not already scaled (i.e. not restored from a
# processed rds) and SCTransform was not used (SCTransform selects its own
# variable features). Uses scalar `&&` in the `if` condition and an explicit
# `!=` instead of relying on `!` having lower precedence than `==`.
if (!("scale.data" %in% sc[[param$project_id]][["RNA"]][]) && param$norm != "SCT") {
sc = purrr::map(sc, Seurat::FindVariableFeatures, selection.method="vst", nfeatures=3000, verbose=FALSE)
}
# Variable-feature scatter plot for every sample, labelling the top-10 genes
p_list = purrr::map(list_names(sc), function(sample_name) {
  # Resolve assay and selection method once for the chosen normalization
  assay_used = ifelse(param$norm=="SCT", param$norm, param$assay_raw)
  sel_method = ifelse(param$norm=="RNA", "vst", "sct")
  top10 = head(Seurat::VariableFeatures(sc[[sample_name]], assay=assay_used), 10)
  plt = Seurat::VariableFeaturePlot(sc[[sample_name]],
                                    assay=assay_used,
                                    selection.method=sel_method,
                                    col=c("grey", param$col), pt.size = param$pt_size) +
    AddStyle(title=sample_name) +
    theme(legend.position=c(0.2, 0.8), legend.background=element_rect(fill=alpha("white", 0.0)))
  # Annotate the ten most variable genes
  plt = LabelPoints(plot=plt, points=top10, repel=TRUE, xnudge=0, ynudge=0)
  return(plt)
})
# Arrange per-sample panels in a two-column grid
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Variable genes")
p
If multiple data sets are processed, we merge the datasets now.
# Combine the per-sample Seurat objects into a single object.
if (length(sc) == 1) {
# Default assay is set automatically
# Single sample: just unwrap the list, nothing to merge/integrate.
sc = sc[[1]]
message("Your dataset contains 1 sample only. No merging/integrating required.")
} else {
# Multiple samples: delegate merging/integration to the combine module.
source(file.path(param$path_to_git,'modules/pre-processing/combine_samples.R'))
}×
(Message)
Data values for all samples have been
merged. This means that data values have been concatenated, not
integrated.
## An object of class Seurat
## 8000 features across 1000 samples within 1 assay
## Active assay: RNA (8000 features, 3000 variable features)
## 3 layers present: data, counts, scale.data
## 1 dimensional reduction calculated: pca
# Only relevant if data loaded from rds, otherwise they would be scaled and dimensional reduced before
# Center and scale gene expression if the "scale.data" layer is absent.
if (!("scale.data" %in% sc[["RNA"]][])) {
# Scale (default)
# Scaling all genes (not just variable features) keeps scaled values
# available for downstream plots of arbitrary genes.
all_genes = rownames(sc)
sc = suppressMessages(ScaleData(sc, features = all_genes))
}
# Run pca with the variable genes
# Skip if a "pca" reduction already exists (e.g. object restored from rds).
if (!("pca" %in% list_names(sc@reductions[]))) {
# Run PCA for default normalization
# npcs is capped by the number of cells — cannot compute more PCs than cells.
sc = suppressMessages(Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE, npcs=min(50, ncol(sc))))
}To better understand the efficiency of the applied normalization
procedures, we plot the relative log expression of genes in 100 randomly
selected cells per sample before and after normalization. This type of
plot reveals unwanted variation in your data.
The concept is taken from Gandolfo and Speed
(2018). In brief, we remove variation between genes by
calculating for each gene its median expression across all cells, and
then calculate the deviation from this median for each cell. For each
cell, we plot the median expression (black), the interquartile range
(lightgrey),
whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers
(#00468BFF, #ED0000FF).
If expression levels of most genes are supposed to be similar in all
cell types, sample heterogeneity is a sign of unwanted variation.
# Relative log expression (RLE) plots before/after normalization
# (concept from Gandolfo & Speed 2018) to reveal unwanted per-sample variation.
n_cells_rle_plot = 100
# Sample at most 100 cells per dataset and save their identity.
# dplyr::slice_sample() supersedes sample_n() and silently truncates to the
# group size, so no explicit min() against the group length is needed.
# Ungroup explicitly so no grouping lingers on the result.
cells_subset = sc[["orig.ident"]] %>% tibble::rownames_to_column() %>%
dplyr::group_by(orig.ident) %>%
dplyr::slice_sample(n=n_cells_rle_plot) %>%
dplyr::ungroup() %>%
dplyr::select(rowname, orig.ident)
# Plot raw data: log2(counts + 1) from the raw assay
p1 = PlotRLE(as.matrix(log2(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=param$assay_raw, layer = "counts") + 1)),
id=cells_subset$orig.ident,
col=param$col_samples) +
labs(title="log2(raw counts + 1)")
# Plot normalised data from the normalization assay
p2 = PlotRLE(as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=param$norm, layer = "data")),
id=cells_subset$orig.ident,
col=param$col_samples) +
labs(title="Normalised data")
# Stack raw above normalised for direct comparison
p = p1 / p2
p
Next, we estimate the impact of covariates, such as the percentage of mitochondrial gene expression and cell cycle phase, to evaluate the reliability of the applied normalization and scaling method, and to determine possible variables to regress out and the integration method.
There can be sources of uninteresting variation in the data that is often specific to the dataset. Hence, checking and removing unwanted variation is another important step in pre-processing so that those artifacts do not drive clustering in the downstream analysis.
Variables provided in vars.to.regress are individually regressed against each feature, and the resulting residuals are then scaled and centered. This step allows controlling for factors that may bias clustering.
Count depth effect
Although normalization scales count data to render gene counts
comparable between cells, a count depth effect often remains in the data
as no scaling method can infer the expression values of genes that were
not detected. This count depth effect can be both a biological and a
technical artefact (due to poor sampling).
SCTransform regresses out variation due to the number
of UMIs by default.
Cell cycle score
Cell cycle phase may be a source of significant variation in the data.
It is essential to check, how much the gene expression profiles in the
data reflect the cell cycle phases the single cells were in.
A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes, since expression profiles of different genes are correlated if they are involved in the same biological process. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarize a dataset, and to visualize a dataset.
We use Principal Component Analysis (PCA) to summarize a dataset, overcoming noise and reducing the data to its essential components. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualize the dataset, placing similar cells together in 2D space, see below.
Principal Component Analysis is a way to summarize a dataset and to reduce noise by averaging similar gene expression profiles. The information for correlated genes is compressed into single dimensions called principal components (PCs) and the analysis identifies those dimensions that capture the largest amount of variation. This process gives a more precise representation of the patterns inherent to complex and large datasets.
In a PCA, the first PC captures the greatest variance across the whole dataset. The next PC captures the greatest remaining amount of variance, and so on. This way, the top PCs are likely to represent the biological signal where multiple genes are affected by the same biological processes in a coordinated way. In contrast, random technical or biological noise that affects each gene independently are contained in later PCs. Downstream analyses can be restricted to the top PCs to focus on the most relevant biological signal and to reduce noise and unnecessary computational overhead.Running a PCA on our object, we see how the variance can be
explained.
In case an early PC dimension is split on cell-cycle genes or
mitochondrial gene, we might want to regress this signal from the data,
so that cell-cycle heterogeneity or expression of mitochondrial genes
does not contribute to PCA or downstream analysis. However, remember,
those signals could also always be informative of the biology!
# Gene loadings of the first 12 PCs; strong loadings of cell-cycle or
# mitochondrial genes may indicate signals worth regressing out.
p_list = Seurat::VizDimLoadings(sc, dims=1:12, reduction="pca", col=param$col, combine=FALSE, balanced=TRUE)
# seq_along() is safe for lists of any length (seq() misbehaves on length 0)
for (i in seq_along(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(xlab = paste0("PC ",i))
# Fix: the panels cover 12 PCs, not two — annotation corrected accordingly
p = patchwork::wrap_plots(p_list, ncol=4) + patchwork::plot_annotation("Top gene loadings of the first 12 PCs")
p
The feature plots allow us to examine the distribution of cells with specific features, here QC covariates (number of counts and features as well as percent of mitochondrial and ribosomal reads), and, hence, to infer whether those features might exert some influence on the localisation and distribution of cells in low dimensional space. These plots can clarify the necessity and scope of covariate filter application.
# Generate UMAP
# Only computed when no "umap" reduction exists yet (e.g. freshly processed data).
if (!("umap" %in% list_names(sc@reductions[]))) {
# Default UMAP
# Uses the first param$pc_n PCs and param$umap_k neighbours via uwot.
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot", n.neighbors=param$umap_k))
}# Number of counts
# UMAP coloured by total counts per cell (log10 colour scale).
qc_feature = paste0("nCount_", param$assay_raw)
p1 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature, pt.size = param$pt_size) +
AddStyle(title="Number of counts", legend_position="right", xlab = "UMAP 1", ylab = "UMAP 2") +
scale_colour_gradient(low=param$col_bg, high=param$col, trans="log10"))
# Number of features
# UMAP coloured by detected genes per cell (log10 colour scale).
qc_feature = paste0("nFeature_", param$assay_raw)
p2 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature, pt.size = param$pt_size) +
AddStyle(title="Number of features", legend_position="right", xlab = "", ylab = "") +
scale_colour_gradient(low=param$col_bg, high=param$col, trans="log10"))
# Percent mitochondrial reads
p3 = Seurat::FeaturePlot(sc, features="percent_mt", cols=c(param$col_bg, param$col), pt.size = param$pt_size) +
AddStyle(title="% mitochondrial", legend_position="right", xlab = "", ylab = "")
# Percent ribosomal reads
p4 = Seurat::FeaturePlot(sc, features="percent_ribo", cols=c(param$col_bg, param$col), pt.size = param$pt_size) +
AddStyle(title="% ribosomal", legend_position="right", xlab = "", ylab = "")
#p = ((p2 / p3) | (p4 / p5)) +
# plot_layout(widths = c(2, 2))
# Combine the four QC feature plots side by side.
p = p1 | p2 | p3 | p4
p
The following plots can help to evaluate whether the cell cycle
phases are a major source of variation in the dataset and the necessity
to regress out cell cycle effects during scaling.
An indication to consider removal of cell cycle effects would be
strong differences between samples (that are supposed to have a similar
cell composition) or if cells in the same cell cycle phase form very
distinctive clusters. Nevertheless, the final decision also depends on
the biological system and scientific question.
Note that removing all signal associated to cell cycle can negatively impact downstream analysis. Cell cycle signals can be informative of the biology. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. Moreover, biological signals must be understood in context. Dependencies exist between diffferent cellular processes within the organism. Thus, correcting for one process may unintentionally mask the signal of another. Vice versa, due to a correlation of cell size and some transcriptomic effect attributed to the cell cycle (McDavid et al, 2016), correcting for cell size via normalization, also partially corrects for cell cycle effects in scRNA‐seq data.
An alternative approach is to remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences in cell cycle phase amongst proliferating cells (which are often uninteresting), will be removed. For a more detailed explanation, see the cell cycle vignette for Seuratr Cite("https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html").
r Cite("10.1126/science.aad0501"), and human gene symbols
are translated to gene symbols of the species of interest using
biomaRt.
# Set up colours for cell cycle effect and add to sc object
# Assign one colour per cell cycle phase and store the palette in the object.
col = GenerateColours(num_colours=length(levels(sc$Phase)), names=levels(sc$Phase), palette="ggsci::pal_npg", alphas=1)
sc = ScAddLists(sc, lists=list(Phase=col), lists_slot="colour_lists")
# Plot umap colored by cell cycle phase
p1 = Seurat::DimPlot(sc, reduction="umap", group.by="Phase", pt.size = param$pt_size) +
AddStyle(title="", xlab = "UMAP 1", ylab = "UMAP 2") +
NoLegend()
# Plot umap colored by cell cycle phase, split into one panel per phase
p2 = Seurat::DimPlot(sc, reduction="umap", group.by="Phase", split.by = "Phase", pt.size = param$pt_size, ncol = 1) +
AddStyle(title="", xlab = "UMAP 1", ylab = "UMAP 2") +
NoLegend()× (Warning) Removing 19 cells missing data for vars requested
# Stacked bar chart: per-sample fraction of cells in each cell cycle phase
phase_counts = sc[[]] %>%
  dplyr::group_by(orig.ident, Phase) %>%
  dplyr::summarise(num_cells=length(Phase))
p3 = ggplot(phase_counts, aes(x=orig.ident, y=num_cells, fill=Phase)) +
  geom_bar(stat="identity", position="fill") +
  scale_y_continuous("Fraction of cells") +
  AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) +
  xlab("") +
  theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1, size = 12))
# Assemble overview UMAP, per-phase UMAPs, a spacer, and the phase fractions
p = p1 + p2 + plot_spacer() + p3 +
  plot_layout(widths = c(3, 1, 1, 1)) +
  patchwork::plot_annotation(title="Cell cycle phases")
p
Integration can help to remove technical differences and match
shared cell types and states across datasets.
Whether samples should be combined via merging or integrating is very
context dependent. It is advisable to first perform a dimensionality
reduction on the merged dataset and only proceed to integration if
common cell types separate due to batch effect.
Merging LogNormalized data (1a)
LogNormalized data are merged based according to https://satijalab.org/seurat/archive/v4.3/merge. Merging
will combine the Seurat objects based on the raw count matrices and also
merge the normalized data matrices. Any previously scaled data matrices
are erased. As the results of this layer depend on the expression
across all cells in an object, the previous values from one object
before merging with another are no longer valid. Therefore, the layer
is removed and the scaling needs to be re-run after merging objects.
Merging SCTransform data (2a, 2c)
The goal of SCTransform is to perform normalization within an experiment
by learning the model of technical noise (within the experiment).
Running SCTransform standardizes the expression of each gene in each
cell relative to all the other cells in the input matrix.
2a) If you have multiple samples that are technically
noisy and you want to remove batch effects that are characterized by
simple shifts in mean expression, it is recommended to run SCTransform on
each sample individually. However, the samples should have roughly the
same celltype compositions. The merged object has then multiple SCT
models and the genes in the scale.data are the intersected genes.
2c) However, scale.data calculated based on multiple
different SCTransform models are not comparable. Performing SCTransform
separately on very diverging samples results in loss of the baseline
differences between them (similar to a gene-wise scaling before
merging). Hence, if samples are biologically heterogeneous or under
different treatments and, therefore, the SCTransform models of each
separate sample very diverging, SCTransform should be (re-)run on merged
samples. This way, SCTransform will learn one model using the median
sequencing depth of the entire dataset to calculate the corrected
counts.
Integration (1b, 2b, 2d)
As integration uses the shared highly variable genes, prior to
integration LogNormalize and identification of variable genes or
SCTransform is required. Depending on the integration method,
dimensional reduction of the data is needed additionally (e.g. for
reciprocal PCA).
This workflow implements the following integration methods offered by
Seurat:
- Anchor-based CCA integration (method=CCAIntegration)
- Anchor-based RPCA integration (method=RPCAIntegration)
Anchor-based integration can be applied to either log-normalized or SCTransform-normalized datasets and can be also performed as reference-based integration where one or a subset of the datasets are listed as a ‘reference’. Reference-based integration, reduces computational time.
CCA-based integration enables identification of conserved cell types across datasets and integrative analysis when gene expression is shifted e.g. due to experimental conditions, disease states, or even species affiliation. If cell types are present in only one dataset, the cells will appear as a separate sample-specific cluster.
Integration comprises the following steps:To explore the results of data merging and different integration methods, here integration is performed in low-dimensional space (streamlined (one-line) integrative analysis as described in https://satijalab.org/seurat/articles/seurat5_integration).
The following plots offer a low dimension representation of your data combined via the different methods.
### Integrate
# On merged data sets
# sc = merge(x=sc[[1]], y=sc[2:length(sc)], ...)
# sc = JoinLayers(sc)
# sc = RunPCA(sc, ...)
# Split RNA layer(s)
# Seurat v5: splitting by sample is required before IntegrateLayers().
sc[["RNA"]] <- split(sc[["RNA"]], f = sc$orig.ident)
# As in https://satijalab.org/seurat/articles/integration_introduction.html
# and https://satijalab.org/seurat/articles/seurat5_integration
# Integration functions:
# Run both anchor-based methods; each writes its own reduction and UMAP.
integration_function = c("CCAIntegration", "RPCAIntegration")
reduction_name = c("integrated.cca", "integrated.rpca")
if (param$norm == "RNA") {
for (i in seq(integration_function)) {
sc = IntegrateLayers(object = sc,
method = integration_function[i],
orig.reduction = "pca",
normalization.method = "LogNormalize",
features = VariableFeatures(sc),
new.reduction = reduction_name[i],
verbose = FALSE)
# UMAP per integration method, named e.g. "umap.cca" / "umap.rpca".
# NOTE(review): dims = 1:30 is hard-coded here instead of using param$pc_n — confirm intended.
sc = RunUMAP(sc, reduction = reduction_name[i], dims = 1:30, reduction.name = paste0(gsub("integrated", "umap", reduction_name[i])))
}
} else if (param$norm == "SCT") {
# SCT branch mirrors the RNA branch with SCT normalization.
# NOTE(review): orig.reduction is not set here and falls back to the
# IntegrateLayers default — confirm this matches the RNA branch's intent.
for (i in seq(integration_function)) {
sc = IntegrateLayers(object = sc,
method = integration_function[i],
normalization.method = "SCT",
features = VariableFeatures(sc),
new.reduction = reduction_name[i],
verbose = FALSE)
sc = RunUMAP(sc, reduction = reduction_name[i], dims = 1:30, reduction.name = paste0(gsub("integrated", "umap", reduction_name[i])))
}
}
# Re-join RNA layer(s)
sc[["RNA"]] = JoinLayers(sc[["RNA"]])### Score plots
# PCA score plots colored by sample
p1 = Seurat::DimPlot(sc, reduction="pca", group.by = "orig.ident", cols=param$col_samples, pt.size = 1, dims = c(1,2)) +
AddStyle(legend_position="none", title = "", xlab = "PC 1", ylab = "PC 2")
p2 = Seurat::DimPlot(sc, reduction="pca", group.by = "orig.ident", cols=param$col_samples, pt.size = 1, dims = c(3,4)) +
AddStyle(legend_position="none", title = "", xlab = "PC 3", ylab = "PC 4")
# CCA integrated score plots colored by sample
p3 = Seurat::DimPlot(sc, reduction="integrated.cca", group.by = "orig.ident", cols=param$col_samples, pt.size = 1, dims = c(1,2)) +
AddStyle(legend_position="none", title = "", xlab = "PC 1", ylab = "PC 2")
p4 = Seurat::DimPlot(sc, reduction="integrated.cca", group.by = "orig.ident", cols=param$col_samples, pt.size = 1, dims = c(3,4)) +
AddStyle(legend_position="none", title = "", xlab = "PC 3", ylab = "PC 4")
# RPCA integrated score plots colored by sample
p5 = Seurat::DimPlot(sc, reduction="integrated.rpca", group.by = "orig.ident", cols=param$col_samples, pt.size = 1, dims = c(1,2)) +
AddStyle(legend_position="none", title = "", xlab = "PC 1", ylab = "PC 2")
p6 = Seurat::DimPlot(sc, reduction="integrated.rpca", group.by = "orig.ident", cols=param$col_samples, pt.size = 1, dims = c(3,4)) +
AddStyle(legend_position="none", title = "", xlab = "PC 3", ylab = "PC 4")
### Umaps
# Plot umap colored by sample
# UMAP of the merged (non-integrated) data.
p7 = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", pt.size = param$pt_size, cols = param$col_samples) +
AddStyle(title="", legend_position="right", xlab = "UMAP 1", ylab = "UMAP 2")
# Plot CCA umap colored by sample
p8 = Seurat::DimPlot(sc, reduction="umap.cca", group.by="orig.ident", pt.size = param$pt_size, cols = param$col_samples) +
AddStyle(title="", legend_position="right", xlab = "UMAP 1", ylab = "UMAP 2")
# Plot RPCA umap colored by sample
p9 = Seurat::DimPlot(sc, reduction="umap.rpca", group.by="orig.ident", pt.size = param$pt_size, cols = param$col_samples) +
AddStyle(title="", legend_position="right", xlab = "UMAP 1", ylab = "UMAP 2")
# Combine plots
# Each list entry pairs the two score plots (stacked) with the matching UMAP.
p_list = list()
p_list[[1]] = (p1 / p2)
p_list[[1]] = (p_list[[1]] | p7) +
plot_layout(widths = c(1, 3)) +
plot_annotation(title = 'Merged')
p_list[[2]] = (p3 / p4)
p_list[[2]] = (p_list[[2]] | p8) +
plot_layout(widths = c(1, 3)) +
plot_annotation(title = 'CCA integrated')
p_list[[3]] = (p5 / p6)
p_list[[3]] = (p_list[[3]] | p9) +
plot_layout(widths = c(1, 3)) +
plot_annotation(title = 'RPCA integrated') p_list[[1]]
p_list[[2]]
p_list[[3]]
Now we need to decide how many PCs we want to use for our analyses.
PCs include biological signal as well as noise, and we need to determine the number of PCs with which we include as much biological signal as possible and as little noise as possible. The following “Elbow plot” is designed to help us make an informed decision.
# More approximate technique used to reduce computation time
# Elbow plot of per-PC standard deviation with a dashed guide at param$pc_n.
# NOTE(review): ndims is capped at 20 while param$pc_n is 30 in the parameter
# table below, so the guide line may fall outside the plotted range — confirm.
p = Seurat::ElbowPlot(sc, ndims=min(20, ncol(sc))) +
geom_vline(xintercept=param$pc_n + .5, col="firebrick", lty=2) +
AddStyle(title="Elbow plot")
p# Cannot have more PCs than number of cells
param$pc_n = min(param$pc_n, ncol(sc))
The following parameters were used to run the workflow.
# Render the workflow parameters as a styled HTML table.
out = ScrnaseqParamsInfo(params=param)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")| Name | Value |
|---|---|
| path_to_git | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis |
| scriptname | modules/pre-processing/qc.Rmd |
| author | kosankem |
| assay_raw | RNA |
| downsample_cells_equally | FALSE |
| cell_filter | sample1:nFeature_RNA=c(NA, NA), nCount_RNA=c(NA, NA), percent_mt=c(NA, NA); sample2:nFeature_RNA=c(NA, NA), nCount_RNA=c(NA, NA), percent_mt=c(NA, NA) |
| feature_filter | sample1:min_counts=1, min_cells=5; sample2:min_counts=1, min_cells=5 |
| samples_min_cells | 10 |
| norm | RNA |
| cc_remove | FALSE |
| cc_remove_all | FALSE |
| cc_rescore_after_merge | TRUE |
| experimental_groups | homogene |
| integrate_samples | method:merge |
| pc_n | 30 |
| cluster_k | 20 |
| umap_k | 30 |
| cluster_resolution_test | 0.3, 0.7 |
| cluster_resolution | 0.5 |
| annot_version | 98 |
| annot_main | ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession |
| mart_attributes | ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description |
| col_palette_samples | ggsci::pal_lancet |
| col_palette_clusters | ggsci::pal_igv |
| col_palette_annotation | ggsci::pal_igv |
| col | #0086b3 |
| col_bg | #D3D3D3 |
| pt_size | 0.5 |
| project_id | Testdata |
| path_data | name:sample1, sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample1/, /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample2/ |
| path_out | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/qc |
| species | human |
| debugging_mode | default_debugging |
| mart_dataset | hsapiens_gene_ensembl |
| mt | ^MT- |
| path_reference | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98 |
| reference | hsapiens_gene_ensembl.v98.annot.txt |
| file_annot | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/hsapiens_gene_ensembl.v98.annot.txt |
| file_cc_genes | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/cell_cycle_markers.xlsx |
| col_samples | sample1=#00468BFF, sample2=#ED0000FF |
| downsample_cells_n | 500 |
This report was generated using the modules/pre-processing/qc.Rmd script. Software versions were collected at run time.
# Collect session/software version information and render as a styled table.
out = ScrnaseqSessionInfo(param$path_to_git)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))| Name | Value |
|---|---|
| Run on: | Fri May 17 05:43:37 PM 2024 |
| ktrns/scrnaseq | 9f5e35d534afb1f722e5ece13cedc8bbbb84d3b6 |
| Container | NA |
| R | R version 4.2.1 (2022-06-23) |
| Platform | x86_64-pc-linux-gnu (64-bit) |
| Operating system | Ubuntu 20.04.6 LTS |
| Host name | hpc-rc11 |
| Host OS | #138-Ubuntu SMP Wed Jun 22 15:00:31 UTC 2022 (5.4.0-122-generic) |
| Packages | abind1.4-5, backports1.4.1, beeswarm0.4.0, bibtex0.5.1, bslib0.6.1, cachem1.0.8, cli3.6.2, cluster2.1.3, codetools0.2-18, colorspace2.1-0, cowplot1.1.3, curl5.2.1, data.table1.15.2, deldir2.0-4, digest0.6.34, dotCall641.1-1, dplyr1.1.4, ellipsis0.3.2, evaluate0.23, fansi1.0.6, farver2.1.1, fastDummies1.7.3, fastmap1.1.1, fitdistrplus1.1-11, future1.33.1, future.apply1.11.1, generics0.1.3, ggbeeswarm0.7.2, ggplot23.5.0, ggrastr1.0.2, ggrepel0.9.5, ggridges0.5.6, ggsci3.0.1, globals0.16.2, glue1.7.0, goftest1.2-3, gridExtra2.3, gtable0.3.4, highr0.10, htmltools0.5.7, htmlwidgets1.6.4, httpuv1.6.14, httr1.4.7, ica1.0-3, igraph2.0.2, irlba2.3.5.1, jquerylib0.1.4, jsonlite1.8.8, kableExtra1.4.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.45, labeling0.4.3, later1.3.2, lattice0.20-45, lazyeval0.2.2, leiden0.4.3.1, lifecycle1.0.4, listenv0.9.1, lmtest0.9-40, lubridate1.9.3, magrittr2.0.3, MASS7.3-58, Matrix1.6-5, matrixStats1.1.0, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-157, openxlsx4.2.5.2, parallelly1.37.1, patchwork1.2.0, pbapply1.7-2, pillar1.9.0, pkgconfig2.0.3, plotly4.10.4, plyr1.8.9, png0.1-8, polyclip1.10-6, progressr0.14.0, promises1.2.1, purrr1.0.2, R.methodsS31.8.2, R.oo1.26.0, R.utils2.12.3, R62.5.1, RANN2.6.1, RColorBrewer1.1-3, Rcpp1.0.12, RcppAnnoy0.0.22, RcppHNSW0.6.0, RefManageR1.4.0, renv0.16.0, reshape21.4.4, reticulate1.35.0, rlang1.1.3, rmarkdown2.25, ROCR1.0-11, RSpectra0.16-1, rstudioapi0.15.0, Rtsne0.17, sass0.4.8, scales1.3.0, scattermore1.2, sctransform0.4.1, sessioninfo1.2.2, Seurat5.0.2, SeuratObject5.0.1, shiny1.8.0, sp2.1-3, spam2.10-0, spatstat.data3.0-4, spatstat.explore3.2-6, spatstat.geom3.2-9, spatstat.random3.2-3, spatstat.sparse3.0-3, spatstat.utils3.0-4, stringi1.8.3, stringr1.5.1, survival3.2-13, svglite2.1.3, systemfonts1.0.5, tensor1.5, tibble3.2.1, tidyr1.3.1, tidyselect1.2.0, timechange0.3.0, utf81.2.4, uwot0.1.16, vctrs0.6.5, vipor0.4.7, viridisLite0.4.2, withr3.0.0, xfun0.41, xml21.3.6, xtable1.8-4, 
yaml2.3.8, zip2.3.1, zoo1.8-12 |
This Workflow was developed as module of the sc_analysis workflow at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). Seurat Vignettes were initially used as templates (Hao et al. (2023), Hao et al. (2021)).
# Writes knitcitations references to references.bib file.
# Exports all citations collected via Cite() calls in this report.
knitcitations::write.bibtex(file=file.path(param$path_out, "references.bib"))